Exploratory notebook on the theory and concepts behind style transfer using CNNs. Includes toy example implementations and visualizations.
It covers the generation of new images by a weighted combination of a target visual style and a target semantic content. The process optimizes for both style and content by iteratively refining the input data; it uses activations extracted from the internal layers of an already-trained CNN to obtain representations of the style and content components.
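Concretely, following Gatys et al. (2015), the generated image $x$ is found by minimizing a weighted combination of the two objectives:

$$\mathcal{L}_{total}(x) = \alpha \, \mathcal{L}_{content}(x, c) + \beta \, \mathcal{L}_{style}(x, s)$$

where $c$ and $s$ are the content and style images, and the ratio $\alpha / \beta$ controls the trade-off between matching the content and matching the style.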
In [ ]:
from __future__ import print_function
import time
from PIL import Image
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
%matplotlib inline
import keras
from keras import backend as K
from keras.models import Model
from keras import metrics
from keras.applications.vgg16 import VGG16
import scipy
from scipy.optimize import fmin_l_bfgs_b
from scipy.misc import imsave  # removed in SciPy >= 1.2; imageio.imwrite is the modern replacement
#backend.set_image_data_format('channels_last')
#keras.backend.set_image_dim_ordering('tf')
import os
import sys
sys.path.append(os.path.join(os.getcwd(), os.pardir))
from utils.vgg_utils import preprocess, deprocess, gram_matrix
from utils.vgg16_avg import VGG16_Avg
RES_DIR = os.path.join('resources')
%load_ext autoreload
%autoreload 2
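The `preprocess` and `deprocess` helpers come from `utils.vgg_utils`. As a rough sketch of what they presumably do (the actual utilities may differ): VGG16 expects BGR images with the ImageNet channel means subtracted, and deprocessing inverts those steps.
In [ ]:
# hypothetical sketch of the imported helpers, for reference only
IMAGENET_MEAN = np.array([123.68, 116.779, 103.939], dtype=np.float32)  # per-channel RGB means

def preprocess_sketch(x):
    # subtract the per-channel means and flip RGB -> BGR
    return (x - IMAGENET_MEAN)[:, :, :, ::-1]

def deprocess_sketch(x, shape):
    # flip BGR -> RGB, add the means back, and clip to valid pixel values
    return np.clip(x.reshape(shape)[:, :, :, ::-1] + IMAGENET_MEAN, 0, 255)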
In [ ]:
height = 256
width = 256
In [ ]:
# load content image
content_image = None
with Image.open(os.path.join(RES_DIR, 'superman.jpg')) as img:
    img = img.resize((width, height))  # PIL resize takes (width, height)
    content_image = np.asarray(img, dtype='float32')
    plt.imshow(img.convert(mode='RGB'))
    plt.show()
In [ ]:
# load style image
style_image = None
with Image.open(os.path.join(RES_DIR, 'comics_style.jpg')) as img:
    img = img.resize((width, height))  # PIL resize takes (width, height)
    style_image = np.asarray(img, dtype='float32')
    plt.imshow(img.convert(mode='RGB'))
    plt.show()
In [ ]:
content_image.shape
In this first step I am going to simply recreate an image from noise using the content loss.
In [ ]:
# define input image (here we recreate the style image, but any input image would work)
img_arr = preprocess(np.expand_dims(style_image, axis=0))
#img_arr = preprocess(np.expand_dims(np.array(Image.open(os.path.join(RES_DIR, 'simpsons_style.jpg'))), axis=0))
shp = img_arr.shape
print(shp)
In [ ]:
# get VGG model
model = VGG16(include_top=False)
In [ ]:
# define layer model (VGG model input and intermediate layer output)
layer = model.get_layer('block5_conv1').output
layer_model = Model(model.input, layer)
targ = K.variable(layer_model.predict(img_arr))
In [ ]:
# define our loss and gradients
loss = metrics.mse(layer, targ)
grads = K.gradients(loss, model.input)
fn = K.function([model.input], [loss]+grads)
In [ ]:
# utility class to hold loss and gradients: scipy's fmin_l_bfgs_b wants separate
# loss and gradient callbacks, while the Keras function computes both in a single
# pass, so we cache the gradients at each loss evaluation
class Evaluator(object):
    def __init__(self, f, shp):
        self.f, self.shp = f, shp

    def loss(self, x):
        loss_, self.grad_values = self.f([x.reshape(self.shp)])
        return loss_.astype(np.float64)

    def grads(self, x):
        return self.grad_values.flatten().astype(np.float64)
In [ ]:
evaluator = Evaluator(fn, shp)
In [ ]:
# run optimization process and save result image at each iteration
def solve_image(eval_obj, iterations, x, img_shape, dest_dir=''):
    if dest_dir:
        os.makedirs(dest_dir, exist_ok=True)
    for i in range(iterations):
        start_time = time.time()
        x, min_val, info = fmin_l_bfgs_b(eval_obj.loss, x.flatten(),
                                         fprime=eval_obj.grads, maxfun=20)
        x = np.clip(x, -127, 127)
        print('Current loss value:', min_val)
        end_time = time.time()
        print('Iteration {} completed in {:.1f}s'.format(i, end_time - start_time))
        img = deprocess(x.copy(), img_shape)[0]
        img_filepath = os.path.join(dest_dir, "res_at_iteration_{}.png".format(i))
        imsave(img_filepath, img)
    return x
In [ ]:
x = np.random.uniform(-2.5, 2.5, shp)
#x = np.random.uniform(0, 255, shp) - 128.
plt.imshow(x[0]);
In [ ]:
x = solve_image(evaluator, 5, x, shp, dest_dir='recreate_input')
In [ ]:
plt.imshow(deprocess(x, shp)[0].astype('uint8'))
While in the previous section we recreated the input image from noise, here we recreate only its style component from noise.
In [ ]:
# load and process the style image (keep only the first 3 channels, dropping alpha if present)
style_arr = preprocess(np.expand_dims(style_image, axis=0)[:,:,:,:3])
shp = style_arr.shape
print(shp)
In [ ]:
# get VGG model; VGG16_Avg replaces max-pooling with average-pooling,
# which tends to produce smoother results (as suggested in Gatys et al.)
#model = VGG16(include_top=False, input_shape=shp[1:])
model = VGG16_Avg(include_top=False, input_shape=shp[1:])
In [ ]:
model.summary()
In [ ]:
outputs = {l.name: l.output for l in model.layers}
# style representation from the first convolution of the first two blocks
layers = [outputs['block{}_conv1'.format(o)] for o in range(1,3)]
In [ ]:
layers_model = Model(model.input, layers)
targs = [K.variable(o) for o in layers_model.predict(style_arr)]
In [ ]:
def style_loss(x, targ):
    return metrics.mse(gram_matrix(x), gram_matrix(targ))

# alternative formulation from the Keras example (different normalization):
#S = gram_matrix(style)
#C = gram_matrix(combination)
#channels = 3
#size = height * width
#return K.sum(K.square(S - C)) / (4. * (channels ** 2) * (size ** 2))
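`gram_matrix` is imported from `utils.vgg_utils`; as a sketch of what it presumably computes (the actual utility may differ), it flattens each channel's feature map and takes the channel-by-channel dot products:
In [ ]:
# hypothetical sketch of the imported gram_matrix helper
def gram_matrix_sketch(x):
    # move channels first and flatten the spatial dimensions: (H, W, C) -> (C, H*W)
    features = K.batch_flatten(K.permute_dimensions(x, (2, 0, 1)))
    # correlations between channels, normalized by the number of elements
    return K.dot(features, K.transpose(features)) / x.get_shape().num_elements()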
In [ ]:
loss = sum(style_loss(l1[0], l2[0]) for l1,l2 in zip(layers, targs))
grads = K.gradients(loss, model.input)
style_fn = K.function([model.input], [loss]+grads)
evaluator = Evaluator(style_fn, shp)
In [ ]:
rand_img = lambda shape: np.random.uniform(-2.5, 2.5, shape)
x = rand_img(shp)
#x = scipy.ndimage.filters.gaussian_filter(x, [0,2,2,0])
In [ ]:
plt.imshow(x[0]);
In [ ]:
iterations=10
x = rand_img(shp)
x = solve_image(evaluator, iterations, x, shp, dest_dir='recreate_style')
Here we finally use both the content and style images to perform the style transfer task.
In [ ]:
# load and process the content and style images
content_arr = preprocess(np.expand_dims(content_image, axis=0))
style_arr = preprocess(np.expand_dims(style_image, axis=0))
shp = content_arr.shape
print(content_arr.shape)
print(style_arr.shape)
In [ ]:
# get VGG model (VGG16_Avg could be used here as well; note that Keras'
# pooling='avg' argument only applies global pooling at the output,
# it does not replace the internal max-pooling layers)
model = VGG16(include_top=False, input_shape=shp[1:])
In [ ]:
outputs = {l.name: l.output for l in model.layers}
style_layers = [outputs['block{}_conv2'.format(o)] for o in range(1,6)]
content_name = 'block4_conv2'
content_layer = outputs[content_name]
In [ ]:
style_model = Model(model.input, style_layers)
style_targs = [K.variable(o) for o in style_model.predict(style_arr)]
In [ ]:
content_model = Model(model.input, content_layer)
content_targ = K.variable(content_model.predict(content_arr))
In [ ]:
# style layer weights, giving progressively more emphasis to deeper layers
style_wgts = [0.05, 0.2, 0.2, 0.25, 0.3]
In [ ]:
# combined loss: weighted style losses plus the (down-weighted) content loss
loss = sum(style_loss(l1[0], l2[0])*w
           for l1,l2,w in zip(style_layers, style_targs, style_wgts))
loss += metrics.mse(content_layer, content_targ)/2
grads = K.gradients(loss, model.input)
transfer_fn = K.function([model.input], [loss]+grads)
In [ ]:
evaluator = Evaluator(transfer_fn, shp)
In [ ]:
iterations=10
x = np.random.uniform(-2.5, 2.5, shp)
plt.imshow(x[0]);
In [ ]:
x = solve_image(evaluator, iterations, x, shp, dest_dir=os.path.join('results', 'style_transfer'))
See Keras example.
This approach feeds the concatenation of the content, style, and combination images directly to the network as a single batch, whereas the previous approach built two different models and combined their losses.
In [ ]:
# style loss accumulation from the Keras example; note that this cell relies on
# `loss`, `style_weight`, and the `layers` dict defined in the cells below
feature_layers = ['block1_conv2', 'block2_conv2',
                  'block3_conv3', 'block4_conv3',
                  'block5_conv3']
for layer_name in feature_layers:
    layer_features = layers[layer_name]
    style_features = layer_features[1, :, :, :]
    combination_features = layer_features[2, :, :, :]
    sl = style_loss(style_features, combination_features)
    loss += (style_weight / len(feature_layers)) * sl
In [ ]:
# this section follows the Keras example, which imports the backend directly
from keras import backend
content_image = backend.variable(content_arr)
style_image = backend.variable(style_arr)
combination_image = backend.placeholder((1, height, width, 3))
#if backend.image_data_format() == 'channels_first':
#    combination_image = backend.placeholder((1, 3, height, width))
#else:
#    combination_image = backend.placeholder((1, height, width, 3))
input_tensor = backend.concatenate([content_image,
                                    style_image,
                                    combination_image], axis=0)
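The excerpt also relies on a model built on `input_tensor` and a `layers` dict mapping layer names to outputs, which in the Keras example are defined as follows:
In [ ]:
# build VGG16 on the concatenated input and index its layer outputs by name
model = VGG16(input_tensor=input_tensor, weights='imagenet', include_top=False)
layers = dict([(layer.name, layer.output) for layer in model.layers])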
In [ ]:
content_weight = 0.025
style_weight = 5.0
total_variation_weight = 1.0
In [ ]:
loss = backend.variable(0.)
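The next cell uses a `content_loss` helper, which in the Keras example is simply the sum of squared differences between activations:
In [ ]:
def content_loss(content, combination):
    return backend.sum(backend.square(combination - content))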
In [ ]:
layer_features = layers['block2_conv2']
content_image_features = layer_features[0, :, :, :]
combination_features = layer_features[2, :, :, :]
loss += content_weight * content_loss(content_image_features,
                                      combination_features)
In [ ]:
# total variation loss: penalizes differences between neighbouring pixels,
# encouraging spatial smoothness in the generated image
def total_variation_loss(x):
    a = backend.square(x[:, :height-1, :width-1, :] - x[:, 1:, :width-1, :])
    b = backend.square(x[:, :height-1, :width-1, :] - x[:, :height-1, 1:, :])
    return backend.sum(backend.pow(a + b, 1.25))

loss += total_variation_weight * total_variation_loss(combination_image)
In [ ]:
grads = backend.gradients(loss, combination_image)
In [ ]:
outputs = [loss]
outputs += grads
f_outputs = backend.function([combination_image], outputs)

def eval_loss_and_grads(x):
    x = x.reshape((1, height, width, 3))
    outs = f_outputs([x])
    loss_value = outs[0]
    if len(outs[1:]) == 1:
        grad_values = outs[1].flatten().astype('float64')
    else:
        grad_values = np.array(outs[1:]).flatten().astype('float64')
    return loss_value, grad_values
In [ ]:
x = np.random.uniform(0, 255, (1, height, width, 3)) - 128.
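To complete the pipeline, the optimization loop can reuse the `Evaluator` class defined earlier, since `f_outputs` has the same `[loss] + grads` output structure (a sketch; the official Keras example wraps `eval_loss_and_grads` in its own Evaluator class instead):
In [ ]:
# sketch of the final optimization loop, reusing the Evaluator defined above
evaluator = Evaluator(f_outputs, (1, height, width, 3))
iterations = 10
for i in range(iterations):
    x, min_val, info = fmin_l_bfgs_b(evaluator.loss, x.flatten(),
                                     fprime=evaluator.grads, maxfun=20)
    print('Iteration {}, current loss value: {}'.format(i, min_val))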